In [1]:
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
In [2]:
import datetime
In [3]:
import requests
def get_yahoo_page_html(url):
html = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/53.0.2785.143 Safari/537.36'})
return html.text
In [4]:
def getDataOnePage(html):
targetTableIndex = 0
table = pd.read_html(html,
attrs = {'border': '1' ,
'cellspacing': '0',
'cellpadding': '2',
'bgcolor': '#ffffff'},
header = 0
)[targetTableIndex]
return table
In [5]:
def getDataOnePageTSE(page):
url = 'https://tw.stock.yahoo.com/s/list.php?c=tse&pid=' + str(page)
return getDataOnePage(html = get_yahoo_page_html(url))
In [6]:
# 抓第一頁的資料
df = getDataOnePageTSE(1)
In [7]:
df.tail()
Out[7]:
In [8]:
df.to_excel('stock.xlsx')
In [9]:
def fixTable(marketType, table, theDate = datetime.date.today()):
fixedTable = table
# Drop
fixedTable.drop(['選擇', '凱基證券下單'], axis = 1, inplace = True)
fixedTable.dropna(axis=0, how='all', inplace=True)
# fill missing data
fixedTable['股票代號名稱'] = fixedTable['股票代號']
fixedTable['股票代號'] = fixedTable['股票代號名稱'].map(lambda x: x.split()[0])
fixedTable['股票名稱'] = fixedTable['股票代號名稱'].map(lambda x: x.split()[1])
fixedTable['日期'] = theDate
fixedTable['市場別'] = marketType
# data type
fixedTable.replace('-', np.nan, inplace = True)
fixedTable['股票代號'] = fixedTable['股票代號'].astype(str)
fixedTable['時間'] = fixedTable['時間'].astype(datetime.time)
fixedTable[['成交', '買進', '賣出', '張數', '昨收', '開盤', '最高', '最低']] = \
fixedTable[['成交', '買進', '賣出', '張數', '昨收', '開盤', '最高', '最低']].astype(float)
fixedTable['漲跌'] = fixedTable['成交'] - fixedTable['昨收']
fixedTable['漲跌'] = fixedTable['漲跌'].map(lambda x: round(x, 2))
# sort
# fixedTable.sort_values(by = '股票代號', inplace = True)
# indexing
fixedTable.index = Series(range(len(fixedTable)))
fixedTable.index.name = '項次'
fixedTable = fixedTable.reindex(columns = ['市場別', '股票代號', '股票名稱', '日期', '時間', '成交', '買進', '賣出', '漲跌', '張數', '昨收', '開盤', '最高', '最低'])
return fixedTable
In [10]:
df1 = fixTable('TSE', df)
In [11]:
df1.tail(5)
Out[11]:
In [12]:
# 抓第一頁~第五頁的資料
dfs = map(lambda p: fixTable('TSE', getDataOnePageTSE(p)) , range(1, 6))
In [13]:
# Append 在一起
df = pd.concat(dfs)
len(df)
Out[13]:
In [14]:
df.index = pd.Index(range(len(df))) # 重新編排 row index 編號
df = df[df['股票代號'].str.len() <= 4] # 濾除 權證 資料
df.tail()
Out[14]:
In [15]:
df_類股 = pd.read_excel('..\\data\個股_類別.xls') # 需先解壓縮 個股_類別.rar
df_類股.tail()
Out[15]:
In [16]:
mdf = df.merge(df_類股, left_on = '股票代號', right_on = '個股_代號', how = 'left') # merge
mdf = mdf.drop(['市場別_ID', '個股_代號', '個股_名稱'], axis = 1) # drop 多於的欄位
mdf.tail()
Out[16]:
In [17]:
# 各類股有多少支個股
mdf.groupby(['類股_名稱']).size().sort_index()
Out[17]:
In [18]:
# 各類股 平均股價
mdf.groupby(['類股_名稱'])['成交'].mean().sort_index()
Out[18]: